Covid-19 growth across countries: Cases, Deaths and Recoveries

Data from John Hopkins repo here

In [1]:
%load_ext nb_black
import pandas as pd
import numpy as np
import io
import requests

import plotly.graph_objects as go
import plotly.express as px
from plotly.offline import plot

import plotly.io as pio

pio.templates.default = "plotly_white"
In [2]:
def get_ts_df(df_pd_raw, col_name):
    df_pd_raw.columns = ["state", "country", "latitude", "longitude"] + [
        c for c in df_pd_raw.columns[4:]
    ]
    df_pd_ts_series = df_pd_raw.set_index(
        ["state", "country", "latitude", "longitude"]
    ).stack()
    df_pd_ts_series = pd.DataFrame(df_pd_ts_series).reset_index(drop=False)
    df_pd_ts_series.columns = [
        "state",
        "country",
        "latitude",
        "longitude",
        "timestamp",
        col_name,
    ]
    df_pd_ts_series["timestamp"] = pd.to_datetime(df_pd_ts_series["timestamp"])
    return df_pd_ts_series


def get_country_aggs(df_pd_raw, col_name):
    return (
        df_pd_raw.groupby(["country", "timestamp"])
        .agg({"latitude": "mean", "longitude": "mean", col_name: "sum"})
        .reset_index(drop=False)
    )
In [3]:
url = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Confirmed.csv"
url_deaths = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Deaths.csv"
url_recovered = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Recovered.csv"
In [4]:
s = requests.get(url).content
df_pd_raw_cases = pd.read_csv(io.StringIO(s.decode("utf-8")))

s = requests.get(url_deaths).content
df_pd_raw_deaths = pd.read_csv(io.StringIO(s.decode("utf-8")))

s = requests.get(url_recovered).content
df_pd_raw_recovered = pd.read_csv(io.StringIO(s.decode("utf-8")))
In [5]:
df_pd_cases = get_country_aggs(get_ts_df(df_pd_raw_cases, "numCases"), "numCases")
df_pd_deaths = get_country_aggs(get_ts_df(df_pd_raw_deaths, "numDeaths"), "numDeaths")
df_pd_recovered = get_country_aggs(
    get_ts_df(df_pd_raw_recovered, "numRecovered"), "numRecovered"
)
In [6]:
df_pd_country_first = (
    df_pd_cases[df_pd_cases["numCases"] > 0]
    .groupby("country")
    .agg({"timestamp": "min"})
    .reset_index(drop=False)
)
df_pd_country_first.columns = ["country", "firstTimestamp"]

df_pd_country = pd.merge(
    df_pd_cases,
    df_pd_deaths[["country", "timestamp", "numDeaths"]],
    on=["country", "timestamp"],
    how="outer",
)
df_pd_country = pd.merge(
    df_pd_country,
    df_pd_recovered[["country", "timestamp", "numRecovered"]],
    on=["country", "timestamp"],
    how="outer",
)
df_pd_country = pd.merge(df_pd_country, df_pd_country_first, on="country", how="left")
df_pd_country["daysSince"] = (
    df_pd_country["timestamp"] - df_pd_country["firstTimestamp"]
).dt.days

min_ts = df_pd_country["timestamp"].min().strftime("%Y-%m-%d")
col_days_from_beginning = f"daysFrom{min_ts}"
df_pd_country[col_days_from_beginning] = (
    df_pd_country["timestamp"] - df_pd_country["timestamp"].min()
).dt.days
In [7]:
df_pd_stacked = pd.DataFrame(
    df_pd_country.set_index(
        ["country", "timestamp", "latitude", "longitude", "firstTimestamp", "daysSince"]
    )[["numCases", "numDeaths", "numRecovered"]].stack()
).reset_index(drop=False)
df_pd_stacked.columns = [
    "country",
    "timestamp",
    "latitude",
    "longitude",
    "firstTimestamp",
    "daysSince",
    "type",
    "cases",
]

Rise of num cases from Day-1

  • Since the day-1 of cases reported, how have the number of reported cases, death rate and recover rates progressed?
  • Plot Hans Rosling graphs
In [8]:
px.line(
    df_pd_stacked[df_pd_stacked["daysSince"] >= 0],
    x="daysSince",
    y="cases",
    color="country",
    facet_row="type",
    hover_name="timestamp",
    height=1000,
    title="Increases from 1st day case was reported",
)
In [9]:
df_pd_country[df_pd_country["country"] == "US"]
Out[9]:
country timestamp latitude longitude numCases numDeaths numRecovered firstTimestamp daysSince daysFrom2020-01-22
9676 US 2020-01-22 38.499716 -92.821004 1.0 0.0 0.0 2020-01-22 0.0 0
9677 US 2020-01-23 38.499716 -92.821004 1.0 0.0 0.0 2020-01-22 1.0 1
9678 US 2020-01-24 38.499716 -92.821004 2.0 0.0 0.0 2020-01-22 2.0 2
9679 US 2020-01-25 38.499716 -92.821004 2.0 0.0 0.0 2020-01-22 3.0 3
9680 US 2020-01-26 38.499716 -92.821004 5.0 0.0 0.0 2020-01-22 4.0 4
9681 US 2020-01-27 38.499716 -92.821004 5.0 0.0 0.0 2020-01-22 5.0 5
9682 US 2020-01-28 38.499716 -92.821004 5.0 0.0 0.0 2020-01-22 6.0 6
9683 US 2020-01-29 38.499716 -92.821004 5.0 0.0 0.0 2020-01-22 7.0 7
9684 US 2020-01-30 38.499716 -92.821004 5.0 0.0 0.0 2020-01-22 8.0 8
9685 US 2020-01-31 38.499716 -92.821004 7.0 0.0 0.0 2020-01-22 9.0 9
9686 US 2020-02-01 38.499716 -92.821004 8.0 0.0 0.0 2020-01-22 10.0 10
9687 US 2020-02-02 38.499716 -92.821004 8.0 0.0 0.0 2020-01-22 11.0 11
9688 US 2020-02-03 38.499716 -92.821004 11.0 0.0 0.0 2020-01-22 12.0 12
9689 US 2020-02-04 38.499716 -92.821004 11.0 0.0 0.0 2020-01-22 13.0 13
9690 US 2020-02-05 38.499716 -92.821004 11.0 0.0 0.0 2020-01-22 14.0 14
9691 US 2020-02-06 38.499716 -92.821004 11.0 0.0 0.0 2020-01-22 15.0 15
9692 US 2020-02-07 38.499716 -92.821004 11.0 0.0 0.0 2020-01-22 16.0 16
9693 US 2020-02-08 38.499716 -92.821004 11.0 0.0 0.0 2020-01-22 17.0 17
9694 US 2020-02-09 38.499716 -92.821004 11.0 0.0 3.0 2020-01-22 18.0 18
9695 US 2020-02-10 38.499716 -92.821004 11.0 0.0 3.0 2020-01-22 19.0 19
9696 US 2020-02-11 38.499716 -92.821004 12.0 0.0 3.0 2020-01-22 20.0 20
9697 US 2020-02-12 38.499716 -92.821004 12.0 0.0 3.0 2020-01-22 21.0 21
9698 US 2020-02-13 38.499716 -92.821004 13.0 0.0 3.0 2020-01-22 22.0 22
9699 US 2020-02-14 38.499716 -92.821004 13.0 0.0 3.0 2020-01-22 23.0 23
9700 US 2020-02-15 38.499716 -92.821004 13.0 0.0 3.0 2020-01-22 24.0 24
9701 US 2020-02-16 38.499716 -92.821004 13.0 0.0 3.0 2020-01-22 25.0 25
9702 US 2020-02-17 38.499716 -92.821004 13.0 0.0 3.0 2020-01-22 26.0 26
9703 US 2020-02-18 38.499716 -92.821004 13.0 0.0 3.0 2020-01-22 27.0 27
9704 US 2020-02-19 38.499716 -92.821004 13.0 0.0 3.0 2020-01-22 28.0 28
9705 US 2020-02-20 38.499716 -92.821004 13.0 0.0 3.0 2020-01-22 29.0 29
9706 US 2020-02-21 38.499716 -92.821004 15.0 0.0 5.0 2020-01-22 30.0 30
9707 US 2020-02-22 38.499716 -92.821004 15.0 0.0 5.0 2020-01-22 31.0 31
9708 US 2020-02-23 38.499716 -92.821004 15.0 0.0 5.0 2020-01-22 32.0 32
9709 US 2020-02-24 38.499716 -92.821004 51.0 0.0 5.0 2020-01-22 33.0 33
9710 US 2020-02-25 38.499716 -92.821004 51.0 0.0 6.0 2020-01-22 34.0 34
9711 US 2020-02-26 38.499716 -92.821004 57.0 0.0 6.0 2020-01-22 35.0 35
9712 US 2020-02-27 38.499716 -92.821004 58.0 0.0 6.0 2020-01-22 36.0 36
9713 US 2020-02-28 38.499716 -92.821004 60.0 0.0 7.0 2020-01-22 37.0 37
9714 US 2020-02-29 38.499716 -92.821004 68.0 1.0 7.0 2020-01-22 38.0 38
9715 US 2020-03-01 38.499716 -92.821004 74.0 1.0 7.0 2020-01-22 39.0 39
9716 US 2020-03-02 38.499716 -92.821004 98.0 6.0 7.0 2020-01-22 40.0 40
9717 US 2020-03-03 38.499716 -92.821004 118.0 7.0 7.0 2020-01-22 41.0 41
9718 US 2020-03-04 38.499716 -92.821004 149.0 11.0 7.0 2020-01-22 42.0 42
9719 US 2020-03-05 38.499716 -92.821004 217.0 12.0 7.0 2020-01-22 43.0 43
9720 US 2020-03-06 38.499716 -92.821004 262.0 14.0 7.0 2020-01-22 44.0 44
9721 US 2020-03-07 38.499716 -92.821004 402.0 17.0 7.0 2020-01-22 45.0 45
9722 US 2020-03-08 38.499716 -92.821004 518.0 21.0 7.0 2020-01-22 46.0 46
9723 US 2020-03-09 38.499716 -92.821004 583.0 22.0 7.0 2020-01-22 47.0 47
9724 US 2020-03-10 38.499716 -92.821004 959.0 28.0 8.0 2020-01-22 48.0 48
9725 US 2020-03-11 38.499716 -92.821004 1281.0 36.0 8.0 2020-01-22 49.0 49
9726 US 2020-03-12 38.499716 -92.821004 1663.0 40.0 12.0 2020-01-22 50.0 50
9727 US 2020-03-13 38.499716 -92.821004 2179.0 47.0 12.0 2020-01-22 51.0 51
9728 US 2020-03-14 38.499716 -92.821004 2727.0 54.0 12.0 2020-01-22 52.0 52
9729 US 2020-03-15 38.499716 -92.821004 3499.0 63.0 12.0 2020-01-22 53.0 53
9730 US 2020-03-16 38.499716 -92.821004 4632.0 85.0 17.0 2020-01-22 54.0 54
9731 US 2020-03-17 38.499716 -92.821004 6421.0 108.0 17.0 2020-01-22 55.0 55
9732 US 2020-03-18 38.499716 -92.821004 7783.0 118.0 0.0 2020-01-22 56.0 56
9733 US 2020-03-19 38.499716 -92.821004 13677.0 200.0 0.0 2020-01-22 57.0 57
9734 US 2020-03-20 38.112296 -84.664082 19101.0 244.0 147.0 2020-01-22 58.0 58
In [10]:
top_countries = (
    df_pd_country.groupby("country")
    .agg({"numCases": "sum"})
    .sort_values("numCases", ascending=False)
    .head(50)
    .index.tolist()
)

# top_countries.remove("China")

df_pd = (
    df_pd_country[(df_pd_country["country"]).isin(top_countries)]
    .sort_values("timestamp")
    .copy()
)
df_pd = df_pd.fillna(1)
df_pd["deathRate"] = (df_pd["numDeaths"] / df_pd["numCases"]).fillna(0)
df_pd["ts_str"] = df_pd["timestamp"].astype(str)
df_pd["deathPer1M"] = (df_pd["numDeaths"] * 1e6 / df_pd["numCases"]).fillna(0) + 1
df_pd["numDeaths"] = df_pd["numDeaths"] + 1
df_pd["numCases"] = df_pd["numCases"] + 1
In [12]:
x = "numCases"
y = "numDeaths"
size = "deathPer1M"
min_ts = df_pd["timestamp"].min().strftime("%Y-%m-%d")
max_ts = df_pd["timestamp"].max().strftime("%Y-%m-%d")

fig = px.scatter(
    df_pd[df_pd[col_days_from_beginning] > 0].sort_values("timestamp"),
    x=x,
    y=y,
    animation_frame="ts_str",
    animation_group="country",
    size=size,
    color="country",
    hover_name="timestamp",
    range_x=[1, 200e3],
    range_y=[1, 1e4],
    size_max=55,
    color_discrete_sequence=px.colors.qualitative.Dark24,
    text="country",
    log_x=True,
    log_y=True,
    title=f"Covid-19: Top-50 countries; {min_ts}  -- {max_ts} <br> Size of the bubble = Num deaths per million",
    opacity=0.75,
)

fig.update_traces(textposition="top center", textfont=dict(size=10, color="gray"))
fig.update_layout(
    xaxis=dict(title="Total Num Cases"), yaxis=dict(title="Total Num Deaths"),
)
plot(fig, filename=f"covid.html", auto_open=True)
Out[12]:
'covid.html'

Rate of increases

In [13]:
df_pd_1day_lag = (
    df_pd_country.sort_values(["country", "timestamp"])
    .set_index(["timestamp", "country"])
    .groupby(level="country")
    .shift(1)
)[["numCases", "numDeaths", "numRecovered"]]
df_pd_1day_lag.columns = ["prevNumCases", "prevNumDeaths", "prevNumRecovered"]
In [14]:
df_pd_country = df_pd_country.set_index(["timestamp", "country"]).join(
    df_pd_1day_lag, how="left"
)
df_pd_country["diffNumCases"] = (
    df_pd_country["numCases"] - df_pd_country["prevNumCases"]
)
df_pd_country["rateIncreaseNumCases"] = (
    df_pd_country["numCases"] / df_pd_country["prevNumCases"]
)

df_pd_country["diffNumDeaths"] = (
    df_pd_country["numDeaths"] - df_pd_country["prevNumDeaths"]
)
df_pd_country["rateIncreaseNumDeaths"] = (
    df_pd_country["numDeaths"] / df_pd_country["prevNumDeaths"]
)

df_pd_country["diffNumRecovered"] = (
    df_pd_country["numRecovered"] - df_pd_country["prevNumRecovered"]
)
df_pd_country["rateIncreaseNumRecovered"] = (
    df_pd_country["numRecovered"] / df_pd_country["prevNumRecovered"]
)

df_pd_country["deathRate"] = (
    df_pd_country["numDeaths"] / df_pd_country["numCases"]
).fillna(0)
In [15]:
df_pd_country = df_pd_country.reset_index(drop=False)
In [16]:
px.line(
    df_pd_country.reset_index(drop=False),
    x="timestamp",
    y="rateIncreaseNumCases",
    color="country",
    title="Rate of increase of reported num cases between consecutive days",
)
In [17]:
px.line(
    df_pd_country.reset_index(drop=False),
    x="timestamp",
    y="deathRate",
    color="country",
    title="Death rate",
)

Exponential fits and find top countries with sharp-rise

In [20]:
from scipy import optimize
from lmfit import Model


def get_exp_fit_for_country(df_pd_country, country, metric="numCases"):
    df_pd_per_country = (
        df_pd_country[
            (df_pd_country["country"] == country) & (df_pd_country["daysSince"] > 0)
        ]
        .sort_values("daysSince")
        .copy()
    )
    exp_model = Model(lambda t, a, b, t0: a * np.exp(b * (t - t0)))

    result = exp_model.fit(
        df_pd_per_country[metric]
        + np.random.normal(0, 0.2, df_pd_per_country.shape[0]),
        t=df_pd_per_country["daysSince"],
        a=1.5,
        b=2.4,
        t0=1,
    )
    metric_camel = str(metric[0]).upper() + metric[1:]
    df_pd_per_country[f"bestFit{metric_camel}"] = result.best_fit
    return result, df_pd_per_country
In [21]:
model_results = {}
result_fit_df = {}
last_jump_dict = {}
for country in top_countries:
    print(f"Country={country}")
    result, df_pd_per_country = get_exp_fit_for_country(df_pd_country, country)
    dely = result.eval_uncertainty(sigma=3)
    df_pd_per_country["bestFitNumCasesLb"] = result.best_fit - dely
    df_pd_per_country["bestFitNumCasesUb"] = result.best_fit + dely
    df_pd_per_country["normResidual"] = (
        df_pd_per_country["numCases"] - df_pd_per_country["bestFitNumCases"]
    ) / df_pd_per_country["numCases"]
    model_results[country] = result
    result_fit_df[country] = df_pd_per_country

    last_jump_dict[country] = (
        df_pd_per_country["normResidual"].tolist()[-1]
        if df_pd_per_country.shape[0] > 10
        else None
    )
    # print(result.fit_report())
    df_pd_fig = df_pd_per_country[
        [
            "daysSince",
            "numCases",
            "bestFitNumCasesLb",
            "bestFitNumCases",
            "bestFitNumCasesUb",
        ]
    ]
    df_pd_fig.columns = [
        "daysSince",
        "actualNumCases",
        "bestFitNumCasesLb",
        "bestFitNumCases",
        "bestFitNumCasesUb",
    ]
    df_pd_fig = pd.melt(
        df_pd_fig,
        id_vars=["daysSince"],
        value_vars=[
            "actualNumCases",
            "bestFitNumCasesLb",
            "bestFitNumCases",
            "bestFitNumCasesUb",
        ],
    )
    df_pd_fig.columns = ["daysSince", "type", "numCases"]
    fig = px.line(
        df_pd_fig,
        x="daysSince",
        y="numCases",
        color="type",
        title=f"Exp-fit: {country}",
    )
    fig.show()
Country=China
Country=Italy
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\series.py:679: RuntimeWarning:

overflow encountered in exp

C:\ProgramData\Anaconda3\lib\site-packages\lmfit\model.py:1501: RuntimeWarning:

invalid value encountered in add

Country=Iran
Country=Korea, South
Country=Spain
Country=Germany
Country=France
Country=US
Country=Switzerland
Country=Cruise Ship
Country=United Kingdom
Country=Netherlands
Country=Japan
Country=Norway
Country=Sweden
Country=Belgium
Country=Austria
Country=Denmark
Country=Malaysia
Country=Singapore
Country=Australia
Country=Canada
Country=Qatar
Country=Portugal
Country=Israel
Country=Czechia
Country=Greece
Country=Bahrain
Country=Thailand
Country=Brazil
Country=Finland
Country=Ireland
Country=Iceland
Country=Slovenia
Country=Kuwait
Country=Poland
Country=Pakistan
Country=Taiwan*
Country=Iraq
Country=Egypt
Country=Romania
Country=Estonia
Country=United Arab Emirates
Country=India
Country=Indonesia
Country=Philippines
Country=Chile
Country=Saudi Arabia
Country=Luxembourg
Country=Lebanon
In [22]:
df_country_fit_results = pd.DataFrame(
    [
        [
            country,
            model_results[country].result.bic,
            model_results[country].result.aic,
            model_results[country].result.chisqr,
            model_results[country].result.params["a"].value,
            model_results[country].result.params["b"].value,
            model_results[country].result.params["t0"].value,
            model_results[country].ndata,
            last_jump_dict[country],
        ]
        for country in model_results.keys()
    ],
    columns=[
        "country",
        "bic",
        "aic",
        "chisq",
        "paramA",
        "paramExponent",
        "paramT0",
        "numPoints",
        "normResidualLast",
    ],
).sort_values("bic")

df_pd_fig = pd.melt(
    df_country_fit_results, id_vars=["country"], value_vars=["aic", "bic"],
)
df_pd_fig.columns = ["country", "type", "metric"]
In [23]:
fig = px.scatter(
    df_country_fit_results.sort_values("paramExponent", ascending=False),
    x="paramExponent",
    y="bic",
    color="country",
    size="numPoints",
    color_discrete_sequence=px.colors.qualitative.Dark24,
    title="Exponents and BIC for countries <br>Size=num valid data points",
    opacity=0.5,
    text="country",
)

fig.update_traces(textposition="top center", textfont=dict(size=8, color="gray"))
fig.update_layout(
    xaxis=dict(title="Exponent <br> (greater exponent means sharper rise)"),
    yaxis=dict(title="BIC <br> (less BIC means more confidence in the model)"),
)

fig.show()
In [24]:
px.bar(
    df_pd_fig,
    x="country",
    y="metric",
    color="type",
    barmode="group",
    title="Model accuracy metrics",
)
In [25]:
fig = px.bar(
    df_country_fit_results.sort_values("normResidualLast"),
    x="country",
    y="normResidualLast",
    color="numPoints",
    color_continuous_scale=["lightgray", "black"],
    title="Is the previous day's jump signifying larger or smaller than expected jump? <br> Color indicates number of points used to model. Greater number of points is preferred",
)

fig.update_layout(
    yaxis=dict(
        title="Normalized residual on the last day <br> = (Actual cases - Predicted cases)/Actual Cases <br> (greater value means sharper rise than expected)"
    ),
)

fig.show()
In [ ]:
 
In [ ]: